library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")
# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'
sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'
Trying to load hourly visits data.
# # load visits data
# bay_origins_hourly_2_24 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-02-24_origins_hourly.rds"))
#
# bay_origins_hourly_3_02 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-03-02_origins_hourly.rds"))
#
# bay_origins_hourly_3_09 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-03-09_origins_hourly.rds"))
#
# bay_origins_hourly_3_16 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-03-16_origins_hourly.rds"))
#
# bay_origins_hourly_3_23 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-03-23_origins_hourly.rds"))
#
# bay_origins_hourly_3_30 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-03-30_origins_hourly.rds"))
#
# bay_origins_hourly_4_06 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-04-06_origins_hourly.rds"))
#
# bay_origins_hourly_4_13 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-04-13_origins_hourly.rds"))
#
# bay_origins_hourly_4_20 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-04-20_origins_hourly.rds"))
#
# bay_origins_hourly_4_27 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-04-27_origins_hourly.rds"))
#
# bay_origins_hourly_5_04 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-05-04_origins_hourly.rds"))
#
# bay_origins_hourly_5_11 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-05-11_origins_hourly.rds"))
#
# bay_origins_hourly_5_18 <- readRDS(paste0(sg_hourly_path, "bay_patterns_2020-05-18_origins_hourly.rds"))
# # try loading the SJ data as a test
# sj_origins_hourly_3_09 <- readRDS(paste0(sg_path, "weekly-patterns/v2/sj_patterns_2020-03-09_origins_hourly.rds"))
#
# bay_origins_hourly_05_11 <- readRDS(paste0(sg_hourly_path, "y.rds_day_2020-05-11.rds"))
Just looking at visits data on a weekly level - this took too long to run too so I didn’t do it.
# poi_ca <- readRDS(paste0(sg_path,'ca_poi.rds'))
#
# # function to load weekly patterns
# load_weekly_patterns <- function(week) {
#
# patterns <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/',week,'-weekly-patterns-ca.rds'))
#
# hps <-
# readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/',week,'-home-panel-summary.rds'))
#
# patterns <-
# patterns %>%
# left_join(
# poi_ca %>%
# dplyr::select(
# safegraph_place_id,
# longitude,
# latitude,
# naics_code
# ),
# by = "safegraph_place_id"
# ) %>%
# filter(!is.na(longitude)) %>%
# mutate(
# long = longitude,
# lat = latitude
# )
#
# patterns_norm <-
# normBG(patterns,hps)
#
# return(patterns_norm)
# }
#
#
# # try loading for SF and Alameda counties starting in March
# # first get block groups
# sfc_blockgroups <-
# block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
# st_transform('+proj=longlat +datum=WGS84')
#
# ac_blockgroups <-
# block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
# st_transform('+proj=longlat +datum=WGS84')
#
# sfc_bgs <- sfc_blockgroups %>% pull(GEOID)
#
# ac_bgs <- ac_blockgroups %>% pull(GEOID)
#
# sfc_weekly_patterns <- NULL
# ac_weekly_patterns <- NULL
#
# for (i in 1:12) {
# week_init <- as.Date("2020-03-02")
# week <- week_init + 7*(i-1)
# week_patterns <- load_weekly_patterns(week)
# sfc_week_patterns <- week_patterns %>% filter(origin_census_block_group %in% sfc_bgs)
# saveRDS(sfc_week_patterns, paste0(sg_path,"weekly-patterns/v2/sfc_patterns_",week,"_norm.rds"))
# ac_week_patterns <- week_patterns %>% filter(origin_census_block_group %in% ac_bgs)
# saveRDS(ac_week_patterns, paste0(sg_path,"weekly-patterns/v2/ac_patterns_",week,"_norm.rds"))
#
# # add to existing patterns
# sfc_weekly_patterns <- rbind(sfc_weekly_patterns, sfc_week_patterns)
# ac_weekly_patterns <- rbind(ac_weekly_patterns, ac_week_patterns)
#
# }
Load SFC and Alameda case data.
# block groups
sf_blockgroups <-
block_groups("CA","San Francisco", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
ac_blockgroups <-
block_groups("CA","Alameda", cb=F, progress_bar=F) %>%
st_transform('+proj=longlat +datum=WGS84')
sf_bgs <- sf_blockgroups %>% pull(GEOID)
ac_bgs <- ac_blockgroups %>% pull(GEOID)
# zip code areas
zctas_94 <-
zctas(cb = F, starts_with = "94") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_95 <-
zctas(cb = F, starts_with = "95") %>%
st_transform('+proj=longlat +datum=WGS84') %>%
dplyr::select(ZCTA5CE10, geometry)
zctas_94_95 <- rbind(zctas_94, zctas_95)
# join with block groups
sf_bg_zctas <- sf_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
ac_bg_zctas <- ac_blockgroups %>%
st_centroid() %>%
st_join(zctas_94_95) %>%
dplyr::select(GEOID, ZCTA5CE10) %>%
rename(blockgroup = GEOID, zipcode = ZCTA5CE10)
# get SF case data
sf_place_cases <-
read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
filter(county == 'San Francisco')
# get population data for San Francisco
acs_vars = readRDS("/Users/simonespeizer/CEE 218Z/censusData2018_acs_acs5.rds")
# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
regionString <- paste0("state:06+county:", county)
censusData <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = regionString,
vars = variableToPull
) %>%
mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
return(censusData)
}
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)
sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
rename(zipcode = place)
# get Alameda County case data - downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")
ac_cases_zip <- ac_place_cases %>%
rename(date = DtCreate) %>%
mutate(date = date %>% substr(1,10) %>% as.Date()) %>%
dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
gather(key = "zipcode", value = "cases", -date) %>%
mutate(cases = ifelse(cases == "<10", "5", cases),
zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
mutate(cases = as.numeric(cases))
# get Alameda County populations by zip code
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
mutate(cases_by_pop = cases / total_pop_zip)
# plot Alameda County case data
ac_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
layout(title = "Alameda County")
# also plot SF for comparison
sf_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
layout(title = "San Francisco County")
# plot Alameda and SF together
combined_cases_zip <- rbind(cbind(ac_cases_zip, county_name = "Alameda") %>% dplyr::select(date, cases_by_pop, zipcode, county_name), cbind(sf_cases_zip, county_name = "San Francisco") %>% dplyr::select(date, cases_by_pop, zipcode, county_name))
combined_cases_zip %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode, linetype = ~county_name) %>%
layout(title = "San Francisco and Alameda Counties")
# limit the domain more
combined_cases_zip %>% filter(date >= as.Date("2020-04-01")) %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode, linetype = ~county_name) %>%
layout(title = "San Francisco and Alameda Counties")
SF seems to have leveled off, or maybe data just hasn’t been updated.
Out of curiosity, only plot Alameda County points that are greater than 0.002 cases per person.
ac_cases_zip %>% filter(cases_by_pop >= 0.002) %>% plot_ly() %>%
add_trace(x = ~date, y = ~cases_by_pop, type = 'scatter', mode = 'lines', color = ~zipcode) %>%
layout(title = "Alameda County, filtered to > 0.002 cases/person")
Load social distancing data and plot for zip codes in the two counties.
bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>%
mutate(date = date_range_start %>% substr(1,10) %>% as.Date())
# relevant dates
pre_sd_date_cutoff <- as.Date("2020-03-01")
shelter_date <- as.Date("2020-03-17") # note that this is the day the order went into effect
# get daily social distancing data for SF by date and zip code, including mean at home time
# first get block group populations for weighted averages
sf_pop_bg <- pullCensus("B01003_001E", sf_fips) %>%
rename(total_pop = B01003_001E)
ac_pop_bg <- pullCensus("B01003_001E", ac_fips) %>%
rename(total_pop = B01003_001E)
sf_sd_zip_by_date <- bay_sd %>%
filter(origin_census_block_group %in% sf_bgs) %>%
left_join(sf_pop_bg, by = c("origin_census_block_group" = "blockgroup")) %>%
left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count), not_home_time_wtavg = weighted.mean(mean_non_home_dwell_time, total_pop), home_time_wtavg = weighted.mean(mean_home_dwell_time, total_pop), median_not_home_time_wtavg = weighted.mean(median_non_home_dwell_time, total_pop), median_home_time_wtavg = weighted.mean(median_home_dwell_time, total_pop)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices,
percent_leaving_home = (100 - percent_at_home)) %>%
ungroup() %>%
left_join(sf_pop_zip)
# get daily social distancing data for Alameda County by date
ac_sd_zip_by_date <- bay_sd %>%
filter(origin_census_block_group %in% ac_bgs) %>%
left_join(ac_pop_bg, by = c("origin_census_block_group" = "blockgroup")) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
group_by(zipcode, date) %>%
summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count), not_home_time_wtavg = weighted.mean(mean_non_home_dwell_time, total_pop), home_time_wtavg = weighted.mean(mean_home_dwell_time, total_pop), median_not_home_time_wtavg = weighted.mean(median_non_home_dwell_time, total_pop), median_home_time_wtavg = weighted.mean(median_home_dwell_time, total_pop)) %>%
mutate(percent_at_home = total_at_home_zip*100/total_devices,
percent_leaving_home = (100 - percent_at_home)) %>%
ungroup() %>%
left_join(ac_pop_zip)
# plot percent leaving home
sf_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~percent_leaving_home, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "San Francisco County")
ac_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~percent_leaving_home, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "Alameda County")
# plot mean of median time outside of home
sf_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~median_not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "San Francisco County, Zip Code Weighted Average Median Time Outside Home")
ac_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~median_not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "Alameda County, Zip Code Weighted Average of Median Time Outside Home")
# plot mean time outside of home
sf_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "San Francisco County")
ac_sd_zip_by_date %>% plot_ly() %>%
add_trace(x = ~date, y = ~not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "Alameda County")
Note that zip codes 94720, 94704, and 94613 look like extreme outliers in these plots, especially in the time not at home, even before COVID-19 but especially during COVID-19 times. Let’s map these.
mapview(zctas_94 %>% filter(ZCTA5CE10 == "94720" | ZCTA5CE10 == "94704" | ZCTA5CE10 == "94613"))
These zip codes are associated with universities (Berkeley and Mills College) so maybe that is why they are skewing the results. I will remove them for further analysis.
ac_sd_zip_by_date_edit <- ac_sd_zip_by_date %>% filter(zipcode != "94720" & zipcode != "94704" & zipcode != "94613")
ac_sd_zip_by_date_edit %>% plot_ly() %>%
add_trace(x = ~date, y = ~median_not_home_time_wtavg, color = ~zipcode, type = 'scatter', mode = 'lines') %>% layout(title = "Alameda County, Zip Code Weighted Average of Median Time Outside Home, Extreme Outliers Removed")
That’s a little more reasonable.
Plot the overall trends for the two counties
# plot overall for the two counties
sf_sd_zip_total <- sf_sd_zip_by_date %>% group_by(date) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices), median_not_home_time_wtavg_tot = weighted.mean(median_not_home_time_wtavg, total_pop_zip), not_home_time_wtavg_tot = weighted.mean(not_home_time_wtavg, total_pop_zip)) %>%
mutate(percent_at_home = total_at_home*100/total_devices,
percent_leaving_home = (100 - percent_at_home)) %>%
ungroup()
ac_sd_zip_total <- ac_sd_zip_by_date_edit %>% group_by(date) %>%
summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices), median_not_home_time_wtavg_tot = weighted.mean(median_not_home_time_wtavg, total_pop_zip), not_home_time_wtavg_tot = weighted.mean(not_home_time_wtavg, total_pop_zip)) %>%
mutate(percent_at_home = total_at_home*100/total_devices,
percent_leaving_home = (100 - percent_at_home)) %>%
ungroup()
sf_ac_total <- left_join(sf_sd_zip_total %>% dplyr::select(percent_leaving_home, not_home_time_wtavg_tot, median_not_home_time_wtavg_tot, date) %>% rename(sf_percent_leaving = percent_leaving_home, sf_not_home_time_wtavg = not_home_time_wtavg_tot, sf_median_not_home_time_wtavg = median_not_home_time_wtavg_tot), ac_sd_zip_total %>% dplyr::select(percent_leaving_home, not_home_time_wtavg_tot, median_not_home_time_wtavg_tot, date) %>% rename(ac_percent_leaving = percent_leaving_home, ac_not_home_time_wtavg = not_home_time_wtavg_tot, ac_median_not_home_time_wtavg = median_not_home_time_wtavg_tot))
# percent leaving
sf_ac_total %>%
plot_ly() %>%
add_trace(x = ~date, y = ~sf_percent_leaving, type = 'scatter', mode = 'lines', name = "San Francisco") %>%
add_trace(x = ~date, y = ~ac_percent_leaving, type = 'scatter', mode = 'lines', name = "Alameda") %>%
layout(title = "San Francisco County vs Alameda County", yaxis = list(title = "percent leaving home"))
# time outside home - average of median
sf_ac_total %>%
plot_ly() %>%
add_trace(x = ~date, y = ~sf_median_not_home_time_wtavg, type = 'scatter', mode = 'lines', name = "San Francisco") %>%
add_trace(x = ~date, y = ~ac_median_not_home_time_wtavg, type = 'scatter', mode = 'lines', name = "Alameda") %>%
layout(title = "San Francisco County vs Alameda County", yaxis = list(title = "Average of median time not at home"))
# time outside home - average of mean
sf_ac_total %>%
plot_ly() %>%
add_trace(x = ~date, y = ~sf_not_home_time_wtavg, type = 'scatter', mode = 'lines', name = "San Francisco") %>%
add_trace(x = ~date, y = ~ac_not_home_time_wtavg, type = 'scatter', mode = 'lines', name = "Alameda") %>%
layout(title = "San Francisco County vs Alameda County", yaxis = list(title = "Average of time not at home"))
Trying to get visits data. For the moment, just aggregating with zip code, not trying to weight, and only keeping separate by date (daily not hourly visits).
NOTE: in this method I just summed all of the hourly visits by each zip code, so this is really counting more like “visit-hours”
# files_to_process <- list.files(paste0(sg_hourly_path, "ByDay/"), pattern = "day")
#
# sj_blockgroups <- readRDS("sj_blockgroups.rds")
#
# # start by just processing the first 7 found here, for one week of data, seeing how long it takes
# sf_daily_visits_zip <- NULL
# ac_daily_visits_zip <- NULL
# sj_daily_visits <- NULL
#
# for (i in 1:7) {
# bay_hourly_visits_curr <- readRDS(paste0(sg_hourly_path, "ByDay/", files_to_process[i]))
#
# ac_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% ac_bgs) %>%
# left_join(ac_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>%
# filter(!is.na(zipcode))
#
# sf_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% sf_bgs) %>%
# left_join(sf_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>% filter(!is.na(zipcode))
#
# sj_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% sj_blockgroups$origin_census_block_group) %>%
# group_by(origin_census_block_group, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#
# sf_daily_visits_zip = rbind(sf_daily_visits_zip, sf_daily_visits_curr)
#
# ac_daily_visits_zip = rbind(ac_daily_visits_zip, ac_daily_visits_curr)
#
# sj_daily_visits <- rbind(sj_daily_visits, sj_daily_visits_curr)
# }
#
# # it worked! so continue
# for (i in 8:length(files_to_process)) {
# bay_hourly_visits_curr <- readRDS(paste0(sg_hourly_path, "ByDay/", files_to_process[i]))
#
# ac_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% ac_bgs) %>%
# left_join(ac_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>%
# filter(!is.na(zipcode))
#
# sf_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% sf_bgs) %>%
# left_join(sf_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
# group_by(zipcode, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>% filter(!is.na(zipcode))
#
# sj_daily_visits_curr <- bay_hourly_visits_curr %>% dplyr::select(origin_census_block_group, date, hour, visit_counts_hourly_high, visit_counts_hourly_low) %>%
# filter(origin_census_block_group %in% sj_blockgroups$origin_census_block_group) %>%
# group_by(origin_census_block_group, date) %>%
# summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
#
# sf_daily_visits_zip = rbind(sf_daily_visits_zip, sf_daily_visits_curr)
#
# ac_daily_visits_zip = rbind(ac_daily_visits_zip, ac_daily_visits_curr)
#
# sj_daily_visits <- rbind(sj_daily_visits, sj_daily_visits_curr)
# }
#
# # it worked! save those rds's
# saveRDS(sj_daily_visits, paste0(sg_path, "weekly-patterns/v2/sj_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
#
# saveRDS(sf_daily_visits_zip, paste0(sg_path, "weekly-patterns/v2/sf_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
#
# saveRDS(ac_daily_visits_zip, paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
# read data on visits
sf_daily_visits_zip <- readRDS(paste0(sg_path, "weekly-patterns/v2/sf_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
ac_daily_visits_zip <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_03-09-20_05-24-20.rds"))
Combine the visits, social distancing data, and case data.
sf_sd_visits_cases_zip_by_date <- left_join(sf_sd_zip_by_date, sf_daily_visits_zip) %>% left_join(sf_cases_zip %>% dplyr::select(-total_pop_zip))
ac_sd_visits_cases_zip_by_date <- left_join(ac_sd_zip_by_date_edit, ac_daily_visits_zip) %>% left_join(ac_cases_zip %>% dplyr::select(-total_pop_zip))
Look at the visits per capita since 3/9, average percent leaving home since 3/9, weighted average of time spent outside of the home, and current levels of cases.
sd_start_date <- as.Date("2020-03-09")
sd_end_date <- as.Date("2020-05-23") # use only data on movement and visits through this date, about a week before max case date
# SF first
sf_sd_visits_cases_zip_curr <- sf_sd_visits_cases_zip_by_date %>%
filter(date >= sd_start_date & date <= sd_end_date) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip),
total_devices = sum(total_devices),
median_not_home_time_wtavg_tot = mean(median_not_home_time_wtavg),
not_home_time_wtavg_tot = mean(not_home_time_wtavg),
total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
mutate(percent_at_home = total_at_home*100/total_devices,
percent_leaving_home = (100 - percent_at_home),
total_visits_avg = (total_visits_high + total_visits_low)/2,
median_frac_day_not_home = (median_not_home_time_wtavg_tot)/(60*24),
mean_frac_day_not_home = (not_home_time_wtavg_tot)/(60*24)) %>%
left_join(sf_cases_zip %>% filter(date == max(date))) %>%
mutate(visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(zipcode) & !is.na(confirmed_cases) & !is.na(visits_per_capita))
# Alameda
ac_sd_visits_cases_zip_curr <- ac_sd_visits_cases_zip_by_date %>%
filter(date >= sd_start_date & date <= sd_end_date) %>%
group_by(zipcode) %>%
summarize(total_at_home = sum(total_at_home_zip),
total_devices = sum(total_devices),
median_not_home_time_wtavg_tot = mean(median_not_home_time_wtavg),
not_home_time_wtavg_tot = mean(not_home_time_wtavg),
total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
mutate(percent_at_home = total_at_home*100/total_devices,
percent_leaving_home = (100 - percent_at_home),
total_visits_avg = (total_visits_high + total_visits_low)/2,
median_frac_day_not_home = (median_not_home_time_wtavg_tot)/(60*24),
mean_frac_day_not_home = (not_home_time_wtavg_tot)/(60*24)) %>%
left_join(ac_cases_zip %>% filter(date == max(date))) %>%
mutate(visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(zipcode) & !is.na(cases) & !is.na(visits_per_capita))
First look at visits per capita.
# San Francisco
sf_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "San Francisco visits/person 3/9-5/23 and current cases/person")
model_sf_visits_cases_curr <- lm(cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_curr)
summary(model_sf_visits_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0019122 -0.0009646 -0.0005723 0.0007585 0.0033672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.057e-05 1.569e-03 0.026 0.980
## visits_per_capita 5.821e-05 3.988e-05 1.460 0.159
##
## Residual standard error: 0.001692 on 22 degrees of freedom
## Multiple R-squared: 0.08828, Adjusted R-squared: 0.04684
## F-statistic: 2.13 on 1 and 22 DF, p-value: 0.1585
# Alameda
ac_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "Alameda County visits/person 3/9-5/23 and current cases/person")
model_ac_visits_cases_curr <- lm(cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_curr)
summary(model_ac_visits_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0020139 -0.0010330 -0.0001598 0.0004552 0.0043311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.296e-03 1.082e-03 -1.198 0.23782
## visits_per_capita 6.034e-05 2.117e-05 2.850 0.00675 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001357 on 42 degrees of freedom
## Multiple R-squared: 0.1621, Adjusted R-squared: 0.1421
## F-statistic: 8.123 on 1 and 42 DF, p-value: 0.006746
Log of cases.
# San Francisco
sf_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)) %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)))), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 3/9-5/23'), yaxis = list(title = 'log(current cases per person)'), title = "San Francisco visits/person 3/9-5/23 and current cases/person")
model_sf_visits_cases_curr_log <- lm(log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)))
summary(model_sf_visits_cases_curr_log)
##
## Call:
## lm(formula = log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_curr %>%
## mutate(log_cases_by_pop = log(cases_by_pop)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1247 -0.5280 -0.0138 0.3915 1.2989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.70651 0.69051 -11.161 1.58e-10 ***
## visits_per_capita 0.03477 0.01755 1.981 0.0603 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.745 on 22 degrees of freedom
## Multiple R-squared: 0.1513, Adjusted R-squared: 0.1128
## F-statistic: 3.923 on 1 and 22 DF, p-value: 0.06027
# Alameda
ac_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)) %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)))), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 3/9-5/23'), yaxis = list(title = 'log(current cases per person)'), title = "Alameda County visits/person 3/9-5/23 and current cases/person")
model_ac_visits_cases_curr_log <- lm(log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_curr %>% mutate(log_cases_by_pop = log(cases_by_pop)))
summary(model_ac_visits_cases_curr_log)
##
## Call:
## lm(formula = log_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_curr %>%
## mutate(log_cases_by_pop = log(cases_by_pop)))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.51021 -0.45937 -0.07387 0.48700 1.44347
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.63619 0.56382 -15.317 < 2e-16 ***
## visits_per_capita 0.03912 0.01103 3.547 0.000973 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.707 on 42 degrees of freedom
## Multiple R-squared: 0.2305, Adjusted R-squared: 0.2122
## F-statistic: 12.58 on 1 and 42 DF, p-value: 0.0009726
Now look at median time outside of the home during a day. Note that this is the time-average of the weighted average of the medians for each block group in a zip code.
# San Francisco
sf_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~median_frac_day_not_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~median_frac_day_not_home, y = fitted(lm(cases_by_pop ~ median_frac_day_not_home, sf_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Fraction of the day spent away from home, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "San Francisco fraction of day away from home 3/9-5/23 and current cases/person")
model_sf_frac_not_home_cases_curr <- lm(cases_by_pop ~ median_frac_day_not_home, sf_sd_visits_cases_zip_curr)
summary(model_sf_frac_not_home_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ median_frac_day_not_home, data = sf_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.002024 -0.001267 -0.000717 0.001137 0.002960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0026891 0.0004645 5.789 7.99e-06 ***
## median_frac_day_not_home -0.0245708 0.0182299 -1.348 0.191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001704 on 22 degrees of freedom
## Multiple R-squared: 0.07628, Adjusted R-squared: 0.03429
## F-statistic: 1.817 on 1 and 22 DF, p-value: 0.1914
# Alameda
ac_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~median_frac_day_not_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~median_frac_day_not_home, y = fitted(lm(cases_by_pop ~ median_frac_day_not_home, ac_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Fraction of the day spent away from home, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "Alameda fraction of day away from home 3/9-5/23 and current cases/person")
model_ac_frac_not_home_cases_curr <- lm(cases_by_pop ~ median_frac_day_not_home, ac_sd_visits_cases_zip_curr)
summary(model_ac_frac_not_home_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ median_frac_day_not_home, data = ac_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0014123 -0.0011085 -0.0004587 0.0004134 0.0045836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0017087 0.0003812 4.482 5.61e-05 ***
## median_frac_day_not_home 0.0014423 0.0183218 0.079 0.938
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001483 on 42 degrees of freedom
## Multiple R-squared: 0.0001475, Adjusted R-squared: -0.02366
## F-statistic: 0.006197 on 1 and 42 DF, p-value: 0.9376
It seems like this metric might not be useful at a zip code level, though maybe could be when looking at overall cases over time.
Percent leaving home:
# San Francisco
sf_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(cases_by_pop ~ percent_leaving_home, sf_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "San Francisco percent of devices leaving home 3/9-5/23 and current cases/person")
model_sf_perc_leaving_cases_curr <- lm(cases_by_pop ~ percent_leaving_home, sf_sd_visits_cases_zip_curr)
summary(model_sf_perc_leaving_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = sf_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0021059 -0.0012014 -0.0006479 0.0008882 0.0032612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0107642 0.0060085 1.792 0.087 .
## percent_leaving_home -0.0001588 0.0001122 -1.415 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001697 on 22 degrees of freedom
## Multiple R-squared: 0.08346, Adjusted R-squared: 0.0418
## F-statistic: 2.003 on 1 and 22 DF, p-value: 0.171
# Alameda
ac_sd_visits_cases_zip_curr %>% plot_ly() %>%
add_trace(x = ~percent_leaving_home, y = ~cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~percent_leaving_home, y = fitted(lm(cases_by_pop ~ percent_leaving_home, ac_sd_visits_cases_zip_curr)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Percent of devices leaving home, 3/9-5/23'), yaxis = list(title = 'Current cases per person'), title = "Alameda percent of devices leaving home 3/9-5/23 and current cases/person")
model_ac_perc_leaving_cases_curr <- lm(cases_by_pop ~ percent_leaving_home, ac_sd_visits_cases_zip_curr)
summary(model_ac_perc_leaving_cases_curr)
##
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = ac_sd_visits_cases_zip_curr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0024293 -0.0007051 -0.0001447 0.0005321 0.0041798
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.919e-03 2.588e-03 -2.287 0.02728 *
## percent_leaving_home 1.419e-04 4.783e-05 2.966 0.00496 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001348 on 42 degrees of freedom
## Multiple R-squared: 0.1732, Adjusted R-squared: 0.1535
## F-statistic: 8.798 on 1 and 42 DF, p-value: 0.004959
Interesting that Alameda has the relationship we would expect but SF does not.
visits_start_date <- as.Date("2020-04-21")
visits_end_date <- as.Date("2020-05-23")
cases_min_date <- as.Date("2020-04-21")
# SF first
# get initial cases
sf_init_cases <- sf_cases_zip %>% filter(date == cases_min_date) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases
sf_sd_visits_cases_zip_change <- sf_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date & date <= visits_end_date) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(sf_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(zipcode) & !is.na(cases_by_pop) & !is.na(visits_per_capita)) %>%
left_join(sf_init_cases) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# Alameda
# get initial cases
ac_init_cases <- ac_cases_zip %>% filter(date == cases_min_date) %>%
dplyr::select(zipcode, cases_by_pop) %>%
rename(init_cases_by_pop = cases_by_pop) %>%
mutate(log_init_cases_by_pop = log(init_cases_by_pop))
# summarize and add current and initial cases
ac_sd_visits_cases_zip_change <- ac_sd_visits_cases_zip_by_date %>%
filter(date >= visits_start_date & date <= visits_end_date) %>%
group_by(zipcode) %>%
summarize(total_visits_high = sum(total_visits_high),
total_visits_low = sum(total_visits_low)) %>%
left_join(ac_cases_zip %>% filter(date == max(date)) %>% mutate(log_cases_by_pop = log(cases_by_pop))) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2,
visits_per_capita = total_visits_avg / total_pop_zip) %>%
filter(!is.na(zipcode) & !is.na(cases_by_pop) & !is.na(visits_per_capita)) %>%
left_join(ac_init_cases) %>%
mutate(change_log_cases_by_pop = log_cases_by_pop - log_init_cases_by_pop,
change_cases_by_pop = cases_by_pop - init_cases_by_pop)
# plot
# San Francisco
sf_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)
summary(model_sf_visits_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0010588 -0.0007260 -0.0003609 0.0005375 0.0018409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.495e-04 8.432e-04 -0.177 0.861
## visits_per_capita 7.147e-05 5.190e-05 1.377 0.182
##
## Residual standard error: 0.0009202 on 22 degrees of freedom
## Multiple R-squared: 0.07934, Adjusted R-squared: 0.03749
## F-statistic: 1.896 on 1 and 22 DF, p-value: 0.1824
# Alameda
ac_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'Change in cases per person from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change <- lm(change_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)
summary(model_ac_visits_cases_change)
##
## Call:
## lm(formula = change_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0012904 -0.0006345 -0.0002643 0.0002761 0.0041796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.008e-03 7.287e-04 -1.383 0.17392
## visits_per_capita 9.529e-05 3.448e-05 2.764 0.00845 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00102 on 42 degrees of freedom
## Multiple R-squared: 0.1539, Adjusted R-squared: 0.1337
## F-statistic: 7.638 on 1 and 42 DF, p-value: 0.008448
Also plot with log of change in cases.
# San Francisco
sf_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "San Francisco visits/person and change in cases/person")
model_sf_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, sf_sd_visits_cases_zip_change)
summary(model_sf_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60802 -0.21192 -0.06168 0.12410 1.89239
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.666177 0.442526 1.505 0.146
## visits_per_capita -0.007165 0.027241 -0.263 0.795
##
## Residual standard error: 0.4829 on 22 degrees of freedom
## Multiple R-squared: 0.003134, Adjusted R-squared: -0.04218
## F-statistic: 0.06917 on 1 and 22 DF, p-value: 0.795
# Alameda
ac_sd_visits_cases_zip_change %>%
plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Alameda visits/person and change in cases/person")
model_ac_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sd_visits_cases_zip_change)
summary(model_ac_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52640 -0.29252 -0.07594 0.21008 1.08346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.10397 0.28981 -0.359 0.7216
## visits_per_capita 0.03805 0.01371 2.775 0.0082 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4057 on 42 degrees of freedom
## Multiple R-squared: 0.155, Adjusted R-squared: 0.1348
## F-statistic: 7.702 on 1 and 42 DF, p-value: 0.008199
Plot both SF and Alameda on the same graph.
plot_ly() %>%
add_trace(x = ac_sd_visits_cases_zip_change$visits_per_capita, y = ac_sd_visits_cases_zip_change$change_log_cases_by_pop, type = 'scatter', mode = 'markers', name = "Alameda") %>%
add_trace(x = sf_sd_visits_cases_zip_change$visits_per_capita, y = sf_sd_visits_cases_zip_change$change_log_cases_by_pop, type = 'scatter', mode = 'markers', name = "San Francisco") %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Visits/person and change in cases/person")
That does actually seem more like a trend might be emerging? Fit a trendline to that.
ac_sf_sd_visits_cases_zip_change <- rbind(ac_sd_visits_cases_zip_change %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita), sf_sd_visits_cases_zip_change %>% dplyr::select(zipcode, change_log_cases_by_pop, visits_per_capita))
ac_sf_sd_visits_cases_zip_change %>% plot_ly() %>%
add_trace(x = ~visits_per_capita, y = ~change_log_cases_by_pop, type = 'scatter', mode = 'markers') %>%
add_trace(x = ~visits_per_capita, y = fitted(lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)), mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = 'Total visit-hours per person, 4/21-5/23'), yaxis = list(title = 'change in log(cases per person) from 4/21-present'), title = "Visits/person and change in cases/person, SF and Alameda")
model_combined_visits_cases_change_log <- lm(change_log_cases_by_pop ~ visits_per_capita, ac_sf_sd_visits_cases_zip_change)
summary(model_combined_visits_cases_change_log)
##
## Call:
## lm(formula = change_log_cases_by_pop ~ visits_per_capita, data = ac_sf_sd_visits_cases_zip_change)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.52768 -0.28324 -0.05356 0.17231 2.07645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.13827 0.21617 0.640 0.5246
## visits_per_capita 0.02628 0.01106 2.377 0.0204 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4355 on 66 degrees of freedom
## Multiple R-squared: 0.07885, Adjusted R-squared: 0.06489
## F-statistic: 5.649 on 1 and 66 DF, p-value: 0.02037
Still hardly significant, actually.
Will just use a couple days to test.
bay_origins_hourly_3_9_day <-readRDS(paste0(sg_hourly_path, "ByDay/bay_patterns_2020-03-09_origins_hourly_day_2020-03-09.rds"))
bay_origins_hourly_3_10_day <-readRDS(paste0(sg_hourly_path, "ByDay/bay_patterns_2020-03-09_origins_hourly_day_2020-03-10.rds"))
bay_origins_hourly_3_9_10 <- rbind(bay_origins_hourly_3_9_day %>%
dplyr::select(safegraph_place_id, date, hour, visit_counts_hourly_high, visit_counts_hourly_low, origin_census_block_group), bay_origins_hourly_3_10_day %>%
dplyr::select(safegraph_place_id, date, hour, visit_counts_hourly_high, visit_counts_hourly_low, origin_census_block_group)) %>%
filter(!is.na(visit_counts_hourly_high))
weights_denom <- 15
bay_hourly_place_weights_3_9_10 <- bay_origins_hourly_3_9_10 %>%
group_by(safegraph_place_id, hour, date) %>%
summarize(total_visits_hourly_high = sum(visit_counts_hourly_high), total_visits_hourly_low = sum(visit_counts_hourly_low)) %>%
mutate(total_visits_hourly_avg = (total_visits_hourly_high + total_visits_hourly_low) / 2,
visits_weight = total_visits_hourly_avg / weights_denom) %>%
ungroup()
# get weighted visits by block group
bay_origins_hourly_3_9_10_weighted <- left_join(bay_origins_hourly_3_9_10, bay_hourly_place_weights_3_9_10 %>% dplyr::select(safegraph_place_id, hour, date, visits_weight)) %>%
group_by(origin_census_block_group, date) %>%
summarize(total_visits_high = sum(visit_counts_hourly_high * visits_weight), total_visits_low = sum(visit_counts_hourly_low * visits_weight)) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
# compare to unweighted visits
bay_origins_hourly_3_9_10_unweighted <- bay_origins_hourly_3_9_10 %>%
group_by(origin_census_block_group, date) %>%
summarize(total_visits_high = sum(visit_counts_hourly_high), total_visits_low = sum(visit_counts_hourly_low)) %>%
mutate(total_visits_avg = (total_visits_high + total_visits_low)/2)
# plot both so we can see how they correlate
plot_ly() %>%
add_trace(x = bay_origins_hourly_3_9_10_unweighted$total_visits_avg, y = bay_origins_hourly_3_9_10_weighted$total_visits_avg, type = 'scatter', mode = 'markers') %>%
layout(title = "Comparing weighted and unweighted visits for all Bay Area origins", xaxis = list(title = "Unweighted daily visit-hours"), yaxis = list(title = "Weighted daily visit-hours"))
Old code, not used.
# # get total daily visits and cases
# sf_visits_cases_by_date <- sf_daily_visits_zip %>%
# group_by(date) %>%
# summarize(total_visits_high = sum(total_visits_high), total_visits_low = sum(total_visits_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>%
# left_join(sf_cases_zip %>% filter(!is.na(total_pop_zip)) %>% group_by(date) %>% summarize(total_cases = sum(confirmed_cases), total_cases_by_pop = sum(cases_by_pop), total_pop = sum(total_pop_zip))) %>%
# mutate(visits_per_capita = total_visits_avg/total_pop, change_cases_by_pop = c(NA, diff(total_cases_by_pop)))
#
# # get total daily visits and cases for alameda
# ac_visits_cases_by_date <- ac_daily_visits_zip %>%
# group_by(date) %>%
# summarize(total_visits_high = sum(total_visits_high), total_visits_low = sum(total_visits_low)) %>%
# mutate(total_visits_avg = (total_visits_high + total_visits_low)/2) %>%
# left_join(ac_cases_zip %>% filter(!is.na(total_pop_zip)) %>% group_by(date) %>% summarize(total_cases = sum(cases), total_cases_by_pop = sum(cases_by_pop), total_pop = sum(total_pop_zip))) %>%
# mutate(visits_per_capita = total_visits_avg/total_pop, change_cases_by_pop = c(NA, diff(total_cases_by_pop)))
#
# # plot
# ay <- list(
# overlaying = "y",
# side = "right",
# title = "visits per person"
# )
#
# sf_visits_cases_by_date %>% plot_ly() %>%
# add_trace(x = ~date, y = ~total_cases_by_pop, mode = 'lines', name = 'cases') %>%
# add_trace(x = ~date, y = ~visits_per_capita, mode = 'lines', yaxis = "y2", name = 'visits') %>%
# layout(title = "San Francisco", yaxis2 = ay)
#
# # plot with daily change in cases
# sf_visits_cases_by_date %>% plot_ly() %>%
# add_trace(x = ~date, y = ~change_cases_by_pop, mode = 'lines', name = 'cases') %>%
# add_trace(x = ~date, y = ~visits_per_capita, mode = 'lines', yaxis = "y2", name = 'visits') %>%
# layout(title = "San Francisco, daily change in cases", yaxis2 = ay)